home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
SGI Freeware 2002 November
/
SGI Freeware 2002 November - Disc 2.iso
/
dist
/
fw_glimpse.idb
/
usr
/
freeware
/
src
/
glimpse-3.0
/
index
/
region.c.z
/
region.c
Wrap
C/C++ Source or Header
|
1997-09-09
|
14KB
|
571 lines
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */
/* From mail received from Bill Camargo and Darren Hardy in June 1994 */
#include <stdio.h>
#include "region.h"
/*
* Exports the following routines. Any filtering/attr-val parsing mechanism
* can be integrated into glimpse and glimpseindex with this interface.
*/
char * /* attrname = */ attr_id_to_name(/* int attrid */);
int /* attrid = */ attr_name_to_id(/* char *attrname */);
int attr_dump_names(/* char *filename */);
int attr_load_names(/* char *filename */);
int attr_free_table(/* void */);
int region_initialize(/* void */);
int region_destroy(/* void */);
int region_create(/* char *filename */);
int /* attrid = */ region_identify(/* int offset_in_file, int len_of_region */);
#if BG_DEBUG
extern int memory_usage;
#endif /* BG_DEBUG*/
#if STRUCTURED_QUERIES
printsp()
{
int x;
printf("stack at %x\n", &x);
}
/*****************************************************************************/
#define ATTR_HASH_TABLE_SIZE 256 /* must be a power of 16=multiple of 4 bits */
#define ATTR_HASH_TABLE_MASK 0xff /* bits that mask off the bits in TABLE_SIZE */
#define ATTR_HASH_STEP_SIZE 2 /* #of nibbles that make up TABLE_SIZE */
attr_element_t *attr_hash_table[ATTR_HASH_TABLE_SIZE];
char **attr_name_table = NULL;
int attr_num = 0;
int attr_maxid = 0;
/* English language characters have all info in lowest 4 bits */
int
attr_hash_index(word, len)
char *word;
int len;
{
int i=0, j, index = 0, temp;
for (i=0; i+ATTR_HASH_STEP_SIZE<=len; i+=ATTR_HASH_STEP_SIZE) {
temp = 0;
for (j=0; j<ATTR_HASH_STEP_SIZE; j++)
temp = (temp << 4) | word[i+j] & 0x0f;
index = (index + temp) & ATTR_HASH_TABLE_MASK;
}
temp = 0;
for (j=0; i+j<len; j++)
temp = (temp << 4) | word[i+j] & 0x0f;
index = (index + temp) & ATTR_HASH_TABLE_MASK;
return index;
}
char *
attr_id_to_name(id)
int id;
{
#if 0
printf("id = %d\n", id);
#endif /*0*/
if ((attr_name_table == NULL) || (id > attr_maxid)) return NULL;
else return attr_name_table[id];
}
/*
* returns the attribute number associated with name, 0 for no attribute --
* NOTE: name may not be null terminated and you are not allowed to alter it.
* called during indexing and search.
*/
int
attr_name_to_id(name, len)
char *name;
int len;
{
int index = attr_hash_index(name, len);
attr_element_t *e = attr_hash_table[index];
#if 0
char c = name[len];
name[len] = '\0';
fprintf(stderr, "attr=%s @ %d?\n", name, index);
fflush(stderr);
name[len] = c;
#endif /*0*/
while(e != NULL) {
if (!strncmp(e->attribute, name, len)) break;
else e = e->next;
}
if (e!=NULL) {
#if 0
fprintf(stderr, "foundid=%d\n", e->attributeid);
#endif /*0*/
return e->attributeid;
}
return 0;
}
/*
* returns the attribute number (> 0) for the attribute "name". It adds the
* name as a newly seen attribute if it doesn't exist already (using #tables).
* called in region_create, which is called during indexing.
*/
attr_insert_name(name, len)
char *name;
int len;
{
int index = attr_hash_index(name, len);
attr_element_t **pe = &attr_hash_table[index], *e;
while(*pe != NULL) {
if (!strcmp((*pe)->attribute, name)) break;
else pe = &(*pe)->next;
}
if (*pe!=NULL) return (*pe)->attributeid;
e = (attr_element_t *)my_malloc(sizeof(attr_element_t));
e->attribute = (char *)my_malloc(len + 2);
strncpy(e->attribute, name, len + 1);
e->attributeid = (++attr_num);
e->next = NULL;
*pe = e;
#if 0
fprintf(stderr, "inserting %s %d\n", name, attr_num);
#endif /*0*/
return e->attributeid;
}
/*
* frees current hash table of attr-value pairs.
* called after dump in indexing, and at end of search (after previous load).
*/
int
attr_free_table()
{
int i;
attr_element_t *e, *temp;
for (i=0; i<ATTR_HASH_TABLE_SIZE; i++) {
e = attr_hash_table[i];
while (e != NULL) {
temp = e->next;
#if BG_DEBUG
memory_usage -= strlen(e->attribute) + 2;
#endif /*BG_DEBUG*/
my_free(e->attribute, 0);
my_free(e, sizeof(attr_element_t));
e = temp;
}
attr_hash_table[i] = NULL;
}
if (attr_name_table != NULL) {
my_free(attr_name_table, sizeof(attr_element_t *) * ATTR_HASH_TABLE_SIZE);
attr_name_table = NULL;
}
return 0;
}
/* Looks for embedded attributes and copies the real attribute into dest */
attr_extract(dest, src)
char *dest, *src;
{
char *oldsrc = src;
check_again:
if (!strncmp("embed<", src, 6) || !strncmp("Embed<", src, 6) || !strncmp("EMBED<", src, 6)) {
src += 6;
while ((*src != '>') && (*src != '\0')) src++;
if (*src == '\0') {
strcpy(dest, oldsrc);
return;
}
while (!isalnum(*src)) src ++; /* assuming type names are .. */
oldsrc = src;
goto check_again;
}
strcpy(dest, src);
return;
}
/*
* dumps the attribute-list into a file name (id, name, \n)
* into the file specified and then destroys the hash table.
* Returns #of attributes dumped into the file, -1 if error.
* called at the end of indexing.
*/
int
attr_dump_names(filename)
char *filename;
{
int i=0;
int ret = -1;
FILE *fp = fopen(filename, "w");
attr_element_t *e;
#if 0
printf("in dump attr\n");
#endif /*0*/
if (fp == NULL) return -1;
ret = 0;
for (i=0; i<ATTR_HASH_TABLE_SIZE; i++) {
e = attr_hash_table[i];
while (e != NULL) {
fprintf(fp, "%d,%s ", e->attributeid, e->attribute);
e = e->next;
ret ++;
}
fputc('\n', fp);
}
fflush(fp);
fclose(fp);
return ret;
}
/*
* constructs a hash-table of attributes by reading them from the file.
* Returns #of attributes read from the file, -1 if error.
* Does not recompute hash-indices of attributes.
* called before searching for attr=val pairs.
*/
int
attr_load_names(filename)
char *filename;
{
int index = 0, ret = 0;
FILE *fp = fopen(filename, "r");
attr_element_t *e;
int c = 0;
char temp[1024]; /* max attr name */
char buffer[1024+32];/* max attr id pair */
int i;
int id;
attr_maxid = 0;
memset(attr_hash_table, '\0', sizeof(attr_element_t *) * ATTR_HASH_TABLE_SIZE);
if (fp == NULL) return -1;
while ((c = getc(fp)) != EOF) {
if (c == '\n') {
index ++;
continue;
}
ungetc(c, fp);
/* fscanf screws up fp and skips over trailing space characters (\t,\n, ) */
i=0;
while ((c=getc(fp)) != ' ') buffer[i++] = c;
buffer[i] = '\0';
#if 0
printf("buffer=%s\n", buffer);
#endif /*0*/
sscanf(buffer, "%d,%1023s", &id, temp);
temp[1023] = '\0';
#if 0
printf("read attr=%s,%d @ %d\n", temp, id, index);
#endif /*0*/
if (id <= 0) continue;
e = (attr_element_t *)my_malloc(sizeof(attr_element_t));
e->attributeid = id;
if (id > attr_maxid) attr_maxid = id;
e->attribute = (char *)my_malloc(strlen(temp) + 2);
strcpy(e->attribute, temp);
e->next = attr_hash_table[index];
attr_hash_table[index] = e;
ret ++;
if (index >= ATTR_HASH_TABLE_SIZE - 1) break;
}
fclose(fp);
attr_name_table = (char **)my_malloc(sizeof(char *) * (ret=(ret >= (attr_maxid + 1) ? ret : (attr_maxid + 1))));
memset(attr_name_table, '\0', sizeof(char *) * ret);
for (i=0; i<ATTR_HASH_TABLE_SIZE; i++) {
e = attr_hash_table[i];
while (e!=NULL) {
attr_name_table[e->attributeid] = e->attribute;
e = e->next;
}
}
return ret;
}
/***************************************************************************/
region_t *current_regions, *nextpos; /* nextpos is hint into list */
/*
* Called during indexing before region_create.
* returns 0.
*/
int
region_initialize()
{
attr_num = 0;
attr_name_table = NULL;
memset(attr_hash_table, '\0', sizeof(attr_element_t *) * ATTR_HASH_TABLE_SIZE);
current_regions = nextpos = NULL;
return 0;
}
/*
* creates a data structure containing the list of attributes
* which occur at increasing offsets in the given file -- future
* region_identify() calls use the "current" data structure.
* returns 0 if success, -1 if it cannot open the file.
*/
int
region_create(name)
char *name;
{
FILE *fp;
AVList *al;
region_t *prl, *rl, *lastrl;
Template *t;
char temp[1024];
current_regions = nextpos = NULL;
if ((fp = fopen(name, "r")) == NULL) return -1;
init_parse_template_file(fp);
lastrl = NULL;
while ((t = parse_template()) != NULL) {
/* do insertion sort of list returned by parse_template using offsets */
if ((t->url != NULL) && (strlen(t->url) > 0)) {
rl = (region_t *)my_malloc(sizeof(region_t));
/* Darren Hardy's Voodo :-) */
/* The SOIF looks like this: @TTYPE { URL\n */
/* t->offset points to the @ */
/* rl->offset points to the space before URL */
/* rl->length includes the entire URL */
rl->offset = t->offset + strlen(t->template_type) + 3;
rl->length = strlen(t->url) + 1;
rl->attributeid = attr_insert_name("url", 3);
if ((lastrl != NULL) && (lastrl->offset <= rl->offset)) { /* go forward */
prl = lastrl;
while (prl->next != NULL) {
if (prl->next->offset > rl->offset) {
rl->prev = prl;
rl->next = prl->next;
prl->next->prev = rl;
prl->next = rl;
lastrl = rl;
break;
}
else prl = prl->next;
}
if (prl->next == NULL) {
rl->next = NULL;
rl->prev = prl;
prl->next = rl;
lastrl = rl;
}
}
else { /* must go backwards and find the right place to insert */
prl = lastrl;
while (prl != NULL) {
if (prl->offset < rl->offset) {
rl->prev = prl;
rl->next = prl->next;
if (prl->next != NULL)
prl->next->prev = rl;
prl->next = rl;
lastrl = rl;
break;
}
else prl = prl->prev;
}
if (prl == NULL) {
rl->next = current_regions;
if (current_regions != NULL) current_regions->prev = rl;
rl->prev = NULL;
current_regions = rl;
lastrl = rl;
}
}
#if 0
printf("region url=[%d,%d]\n", rl->offset, rl->offset+rl->length);
#endif /*0*/
}
al = t->list;
while(al != NULL) {
rl = (region_t *)my_malloc(sizeof(region_t));
rl->offset = al->data->offset;
rl->length = al->data->vsize;
attr_extract(temp, al->data->attribute);
rl->attributeid = attr_insert_name(temp, strlen(temp));
if ((lastrl != NULL) && (lastrl->offset <= rl->offset)) { /* go forward */
prl = lastrl;
while (prl->next != NULL) {
if (prl->next->offset > rl->offset) {
rl->prev = prl;
rl->next = prl->next;
prl->next->prev = rl;
prl->next = rl;
lastrl = rl;
break;
}
else prl = prl->next;
}
if (prl->next == NULL) {
rl->next = NULL;
rl->prev = prl;
prl->next = rl;
lastrl = rl;
}
}
else { /* must go backwards and find the right place to insert */
prl = lastrl;
while (prl != NULL) {
if (prl->offset < rl->offset) {
rl->prev = prl;
rl->next = prl->next;
if (prl->next != NULL)
prl->next->prev = rl;
prl->next = rl;
lastrl = rl;
break;
}
else prl = prl->prev;
}
if (prl == NULL) {
rl->next = current_regions;
if (current_regions != NULL) current_regions->prev = rl;
rl->prev = NULL;
current_regions = rl;
lastrl = rl;
}
}
#if 0
printf("region %s=[%d,%d]\n", al->data->attribute, rl->offset, rl->offset+rl->length);
#endif /*0*/
al = al->next;
}
free_template(t);
}
finish_parse_template();
nextpos = current_regions;
fclose(fp);
return 0;
}
/*
* frees the data structure created for the current file above.
* returns 0.
*/
int
region_destroy()
{
region_t *rl = current_regions, *trl;
while (rl != NULL) {
trl = rl;
rl = rl->next;
free(trl, sizeof(region_t));
}
current_regions = nextpos = NULL;
return 0;
}
/*
* returns attribute number [1..num_attr] which covers (inclusive)
* the region * [offset, offset+len] in the "current" file, 0 if none.
* called during indexing after region_create, and search after
* attr_load_names. Do not need sophisticated interval trees here!
*/
int
region_identify(offset, len)
int offset, len;
{
region_t *rl;
if (nextpos == NULL) nextpos = current_regions;
rl = nextpos;
while (rl!=NULL) {
if (rl->offset > offset + len)
goto backwards; /* definitely before: can be earlier region OR hole */
else if ((rl->offset <= offset) && (rl->offset + rl->length >= offset + len))
return rl->attributeid; /* definitely within */
else if (rl->offset + rl->length < offset)
nextpos = rl = rl->next; /* definitely after: later region */
else return 0; /* overlapping: error */
}
return 0; /* reached end of file */
backwards:
while (rl!=NULL) {
if (rl->offset > offset + len)
nextpos = rl = rl->prev; /* definitely before: earlier region */
else if ((rl->offset <= offset) && (rl->length + rl->length >= offset + len))
return rl->attributeid; /* definitely within */
else if (rl->offset + rl->length < offset)
return 0; /* hole */
else return 0; /* overlapping: error */
}
return 0; /* reached end of file */
}
#else /*STRUCTURED_QUERIES*/
int attr_num = 0;
char *attr_id_to_name(id)
int id;
{
return NULL;
}
int attr_name_to_id(name)
char *name;
{
return 0;
}
int attr_dump_names(name)
char *name;
{
return 0;
}
int attr_load_names(name)
char *name;
{
return 0;
}
int attr_free_table()
{
return 0;
}
int region_initialize()
{
return 0;
}
int region_desrtroy()
{
return 0;
}
int region_create(name)
char *name;
{
return 0;
}
int region_destroy()
{
return 0;
}
int region_identify(offset, len)
int offset, len;
{
return 0;
}
#endif /*STRUCTURED_QUERIES*/